title: “Chen lab RNA-seq version 3 (S449 Samples redone by Novogene again)” output: html_notebook date: April 5, 2019 #human cell lines # si RNA knockdown experiment

library(pheatmap)
## Warning: package 'pheatmap' was built under R version 3.5.2
library(edgeR)
## Warning: package 'edgeR' was built under R version 3.5.2
## Loading required package: limma
library(DESeq2)
## Warning: package 'DESeq2' was built under R version 3.5.2
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
## 
##     clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
##     clusterExport, clusterMap, parApply, parCapply, parLapply,
##     parLapplyLB, parRapply, parSapply, parSapplyLB
## The following object is masked from 'package:limma':
## 
##     plotMA
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, append, as.data.frame, basename, cbind,
##     colMeans, colnames, colSums, dirname, do.call, duplicated,
##     eval, evalq, Filter, Find, get, grep, grepl, intersect,
##     is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
##     paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
##     Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which, which.max,
##     which.min
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
## 
##     expand.grid
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: DelayedArray
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
## 
##     anyMissing, rowMedians
## Loading required package: BiocParallel
## Warning: package 'BiocParallel' was built under R version 3.5.2
## 
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
## 
##     colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following objects are masked from 'package:base':
## 
##     aperm, apply
library(biomaRt)
library(ggplot2)
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] ggplot2_3.1.0               biomaRt_2.38.0             
##  [3] DESeq2_1.22.2               SummarizedExperiment_1.12.0
##  [5] DelayedArray_0.8.0          BiocParallel_1.16.5        
##  [7] matrixStats_0.54.0          Biobase_2.42.0             
##  [9] GenomicRanges_1.34.0        GenomeInfoDb_1.18.1        
## [11] IRanges_2.16.0              S4Vectors_0.20.1           
## [13] BiocGenerics_0.28.0         edgeR_3.24.3               
## [15] limma_3.38.3                pheatmap_1.0.12            
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.0             bit64_0.9-7            splines_3.5.1         
##  [4] Formula_1.2-3          assertthat_0.2.0       latticeExtra_0.6-28   
##  [7] blob_1.1.1             GenomeInfoDbData_1.2.0 progress_1.2.0        
## [10] yaml_2.2.0             RSQLite_2.1.1          pillar_1.3.1          
## [13] backports_1.1.3        lattice_0.20-38        glue_1.3.0            
## [16] digest_0.6.18          RColorBrewer_1.1-2     XVector_0.22.0        
## [19] checkmate_1.9.0        colorspace_1.3-2       htmltools_0.3.6       
## [22] Matrix_1.2-15          plyr_1.8.4             XML_3.98-1.16         
## [25] pkgconfig_2.0.2        genefilter_1.64.0      zlibbioc_1.28.0       
## [28] xtable_1.8-3           purrr_0.2.5            scales_1.0.0          
## [31] tibble_2.0.0           htmlTable_1.13.1       annotate_1.60.0       
## [34] withr_2.1.2            nnet_7.3-12            lazyeval_0.2.1        
## [37] survival_2.43-3        magrittr_1.5           crayon_1.3.4          
## [40] memoise_1.1.0          evaluate_0.12          foreign_0.8-71        
## [43] prettyunits_1.0.2      tools_3.5.1            data.table_1.11.8     
## [46] hms_0.4.2              stringr_1.3.1          munsell_0.5.0         
## [49] locfit_1.5-9.1         cluster_2.0.7-1        AnnotationDbi_1.44.0  
## [52] bindrcpp_0.2.2         compiler_3.5.1         rlang_0.3.1           
## [55] grid_3.5.1             RCurl_1.95-4.11        rstudioapi_0.9.0      
## [58] htmlwidgets_1.3        bitops_1.0-6           base64enc_0.1-3       
## [61] rmarkdown_1.11         gtable_0.2.0           DBI_1.0.0             
## [64] R6_2.3.0               gridExtra_2.3          knitr_1.21            
## [67] dplyr_0.7.8            bit_1.1-14             bindr_0.1.1           
## [70] Hmisc_4.1-1            stringi_1.2.4          Rcpp_1.0.0            
## [73] geneplotter_1.60.0     rpart_4.1-13           acepack_1.4.1         
## [76] tidyselect_0.2.5       xfun_0.4

Read in files. Add new samples that are technical replicates (S449).

filename = "/Users/tfriedrich/Downloads/readcounts.txt"
counts1_old = read.table(filename, header = T,stringsAsFactors = F) 
rownames(counts1_old) = counts1_old$Geneid
counts1_gene_annotation = (counts1_old[,1:6])
counts1_old = data.matrix(counts1_old[,7:ncol(counts1_old)])
colnames(counts1_old) = sub("_1_algn_Aligned", "", matrix(unlist(strsplit(colnames(counts1_old), "\\.")), byrow =T, ncol=6)[,3])

filename2 = "/Users/tfriedrich/Downloads/readcounts_newsamples_version3.txt"
counts1_new = read.table(filename2, header=T, stringsAsFactors = F)  
rownames(counts1_new) = counts1_new$Geneid   
counts1_new = data.matrix(counts1_new[,7:ncol(counts1_new)])
colnames(counts1_new) = sub("_1_algn_Aligned", "", matrix(unlist(strsplit(colnames(counts1_new), "\\.")), byrow =T, ncol=7)[,4])

length(which(rownames(counts1_old)==rownames(counts1_new)))
## [1] 58302
counts1 = data.frame(cbind( counts1_old, counts1_new))      
dim(counts1)
## [1] 58302    24

Filter out genes with low and high counts ( high counts might be mitochondrial genes)

keep = rowMeans(counts1) > 5 & rowMeans(counts1) < 5000 
counts2 = counts1[keep,]

Take the log (RPKM)

y <- DGEList(counts=counts2)
fpkm_log_matrix = rpkm (y, gene.length=counts1_gene_annotation[keep,"Length"], log = TRUE)

Dendrogram

hc2 <- hclust(stats::dist(t(fpkm_log_matrix), method="minkowski"), "ward.D2")
plot(hc2, lwd=4, lty=1, col="black", col.lab="red", xlab="Samples", main="Samples clustered by log2(FPKM+1) using Ward's clustering & Euclidean distance")

Some replicates don’t cluster (see FocusT replicates). Samples cluster by cell type. SS49 and SS475 are more similar than the other cell types.

S449 technical replicate showing batch effects.

Setup up design matrix for PCA plot

groupNamesPerSample = c(matrix (unlist(strsplit(colnames(counts2[,1:20]), "_")), byrow=T, ncol=4) [,1], colnames(counts2[,21:24]))
groupNamesPerSample = unlist( lapply (groupNamesPerSample, function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
conditionFactor  <- factor(groupNamesPerSample)
subjectgroup = c(matrix (unlist(strsplit(colnames(counts2[,1:20]), "_")), byrow=T, ncol=4) [,1], colnames(counts2[,21:24]))
subjectgroup = unlist( lapply (subjectgroup, function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject <- factor(subjectgroup)

PCA plot

cold <- data.frame("conditionFactor"=conditionFactor, "Subject"=Subject) 
design_matrix <- model.matrix( ~Subject + conditionFactor)
ddsMat <- DESeq2::DESeqDataSetFromMatrix(countData=counts2, colData=cold, design=~conditionFactor);
colnames(ddsMat) <- colnames(counts2)
rld <- DESeq2::rlog(ddsMat) # LOG TRANSFORMED.
pca              <- prcomp(t(assay(rld)))
allPer.vec       <- 100 * summary(pca)$importance[2,]
allPer.text      <- paste(format(allPer.vec,digits=0, nsmall=1, scientific=FALSE), "%", sep='')
allPerLabels.vec <- paste(names(allPer.vec), "\n", allPer.text, sep='')

barplot(allPer.vec, main="PCA: % variance explained by each component\n" , ylim=c(0,100), names.arg=allPerLabels.vec)

d <- data.frame(pca$x, group=conditionFactor, Subject, name=colnames(rld)) # returns PC1, PC2, ... PC8... etc
ppp <- ggplot(d, aes_string(x="PC1", y="PC2", shape="Subject", color="group"))
ppp <- ppp + geom_point(size=6)
plot(ppp)

ppp <- ggplot(d, aes_string(x="PC1", y="PC2", shape="Subject", color="group"))
ppp <- ppp + geom_point(size=3) + geom_text(aes(label=name), nudge_y=6)
plot(ppp)

Samples seem to cluster by cell type. old S449 and S475 appear to be very similar. Replicates cluster together mostly.

Group/add up samples with technical replicates

#done manually using counts3 because I don't want to use matrix counts with 1 added to the value
counts3 = data.frame(
            counts2[, "FocusSC_USR17001380L_HJHCJBBXX_L6"]
          , counts2[, "FocusT_USR17001382L_HJHCJBBXX_L7"] + counts2[, "FocusT_USR17001382L_HKF5JBBXX_L1"]
          , counts2[, "FocusYT_USR17001383L_hkff7BBXX_L1"] + counts2[, "FocusYT_USR17001383L_HKY5KBBXX_L1"]
          , counts2[, "FocusY_USR17001381L_HJHCJBBXX_L6"]
          
          , counts2[, "PLCSC_USR17002043L_HJHCJBBXX_L7"]
          , counts2[, "PLCT_USR17002044L_HJHCJBBXX_L7"]
          , counts2[, "PLCYT_USR17001379L_HJHCJBBXX_L6"]
          , counts2[, "PLCY_USR17001377L_HJHCJBBXX_L6"]
          
          , counts2[, "S449SC_USR17001384L_HJHCJBBXX_L7"]
          , counts2[, "S449T_USR17001386L_HJHCJBBXX_L7"] 
          , counts2[, "S449YT_USR17002046L_HJHCJBBXX_L7"] + counts2[, "S449YT_USR17002046L_HKF5JBBXX_L3"]
          , counts2[, "S449Y_USR17001385L_HJHCJBBXX_L7"] + counts2[, "S449Y_USR17001385L_HKF5JBBXX_L3"]
          
          , counts2[, "S475SC_USR17001372L_HJHCJBBXX_L6"]
          , counts2[, "S475T_USR17001374L_HJHCJBBXX_L6"]
          , counts2[, "S475YT_USR17001375L_HJHCJBBXX_L6"]
          , counts2[, "S475Y_USR17001373L_HJHCJBBXX_L6"]
          
          , counts2[, "S449SC"]
          , counts2[, "S449T"]
          , counts2[, "S449Y"]
          , counts2[, "S449YT"]
          )
colnames(counts3)
##  [1] "counts2....FocusSC_USR17001380L_HJHCJBBXX_L6.."                                                 
##  [2] "counts2....FocusT_USR17001382L_HJHCJBBXX_L7.....counts2....FocusT_USR17001382L_HKF5JBBXX_L1.."  
##  [3] "counts2....FocusYT_USR17001383L_hkff7BBXX_L1.....counts2....FocusYT_USR17001383L_HKY5KBBXX_L1.."
##  [4] "counts2....FocusY_USR17001381L_HJHCJBBXX_L6.."                                                  
##  [5] "counts2....PLCSC_USR17002043L_HJHCJBBXX_L7.."                                                   
##  [6] "counts2....PLCT_USR17002044L_HJHCJBBXX_L7.."                                                    
##  [7] "counts2....PLCYT_USR17001379L_HJHCJBBXX_L6.."                                                   
##  [8] "counts2....PLCY_USR17001377L_HJHCJBBXX_L6.."                                                    
##  [9] "counts2....S449SC_USR17001384L_HJHCJBBXX_L7.."                                                  
## [10] "counts2....S449T_USR17001386L_HJHCJBBXX_L7.."                                                   
## [11] "counts2....S449YT_USR17002046L_HJHCJBBXX_L7.....counts2....S449YT_USR17002046L_HKF5JBBXX_L3.."  
## [12] "counts2....S449Y_USR17001385L_HJHCJBBXX_L7.....counts2....S449Y_USR17001385L_HKF5JBBXX_L3.."    
## [13] "counts2....S475SC_USR17001372L_HJHCJBBXX_L6.."                                                  
## [14] "counts2....S475T_USR17001374L_HJHCJBBXX_L6.."                                                   
## [15] "counts2....S475YT_USR17001375L_HJHCJBBXX_L6.."                                                  
## [16] "counts2....S475Y_USR17001373L_HJHCJBBXX_L6.."                                                   
## [17] "counts2....S449SC.."                                                                            
## [18] "counts2....S449T.."                                                                             
## [19] "counts2....S449Y.."                                                                             
## [20] "counts2....S449YT.."
colnames(counts3) = c(unlist(lapply (strsplit(sub("counts2....", "" ,colnames(counts3[,1:16])) , "_"), `[[`, 1)), sub("counts2.", "", colnames(counts3[, 17:20])))
colnames(counts3) = gsub("\\.", "", colnames(counts3))
rownames(counts3) = rownames(counts2)

Look at histogram of counts.

library(ggplot2)
library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:S4Vectors':
## 
##     expand
ggplot(gather((counts3[,1:4]), cols, value), aes(x = value)) + 
       geom_histogram(binwidth=100) + facet_grid(.~cols)

ggplot(gather((counts3[,5:8]), cols, value), aes(x = value)) + 
       geom_histogram(binwidth=100) + facet_grid(.~cols)

ggplot(gather((counts3[,9:12]), cols, value), aes(x = value)) + 
       geom_histogram(binwidth=100) + facet_grid(.~cols)

ggplot(gather((counts3[,13:16]), cols, value), aes(x = value)) + 
       geom_histogram(binwidth=100) + facet_grid(.~cols)

ggplot(gather((counts3[,17:20]), cols, value), aes(x = value)) + 
       geom_histogram(binwidth=100) + facet_grid(.~cols)

Setup up design matrix for Heatmap

Condition_heatmap = unlist( lapply (colnames(counts3), function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
Condition_heatmap  <- factor(Condition_heatmap)
Subject_heatmap = unlist( lapply (colnames(counts3), function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject_heatmap <- factor(Subject_heatmap)
cold <- data.frame("Condition_heatmap"=Condition_heatmap, "Subject_heatmap"=Subject_heatmap) #"sample_name"=colnames(counts4)
#design_matrix2 <- model.matrix( ~Subject2 + Condition)

FPKM after combinining samples that are “technical” replicates

colnames(counts3) = paste0(colnames(counts3), c(rep("_1", 16), rep("_2", 4)))
y_combined <- DGEList(counts=counts3)
fpkm_matrix = rpkm (y_combined, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)

Heatmap to see how cells cluster by cell type or experimental condition. No replicates because they have been combined.

HEATMAP_HEIGHT <- 45 # inchescol
HEATMAP_WIDTH  <- 15
annotation <- data.frame(Expt = Condition_heatmap, Celltype=Subject_heatmap)
rownames(annotation) <- colnames(counts3)
pheatmap(fpkm_matrix,
         , width=HEATMAP_WIDTH, height=HEATMAP_HEIGHT
         , scale="row"  # actually scale each row
         , clustering_distance_rows="correlation" # or: euclidean
         , clustering_distance_cols="correlation"
         , clustering_method="complete"
         , display_numbers=FALSE
         , border_color="#00000000" # note: the last two digits here are the alpha channel/ transparency
         , show_rownames=FALSE, show_colnames = TRUE
         , annotation = annotation
         , fontsize_row=3.5)

This heatmap confirms the pattern seen in the dendrogram and PCA plot.

y_combined <- DGEList(counts=counts3)
fpkm_matrix = rpkm (y_combined, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)
#wwtr1
fpkm_matrix_wwtr1 = fpkm_matrix[which(rownames(fpkm_matrix) == "ENSG00000018408"), ]
fpkm_matrix_wwtr1_percent = c(cbind(
    fpkm_matrix_wwtr1["FocusT_1"]/fpkm_matrix_wwtr1[ "FocusSC_1"]
  , fpkm_matrix_wwtr1["PLCT_1"]/fpkm_matrix_wwtr1[ "PLCSC_1"]
  , fpkm_matrix_wwtr1["S475T_1"]/fpkm_matrix_wwtr1[ "S475SC_1"]
  , fpkm_matrix_wwtr1["S449T_1"]/fpkm_matrix_wwtr1[ "S449SC_1"]
  , fpkm_matrix_wwtr1["S449T_2"]/fpkm_matrix_wwtr1[ "S449SC_2"]
  , fpkm_matrix_wwtr1["FocusYT_1"]/fpkm_matrix_wwtr1[ "FocusSC_1"]
  , fpkm_matrix_wwtr1["PLCYT_1"]/fpkm_matrix_wwtr1[ "PLCSC_1"]
  , fpkm_matrix_wwtr1["S475YT_1"]/fpkm_matrix_wwtr1[ "S475SC_1"]
  , fpkm_matrix_wwtr1["S449YT_1"]/fpkm_matrix_wwtr1[ "S449SC_1"] 
  , fpkm_matrix_wwtr1["S449YT_2"]/fpkm_matrix_wwtr1[ "S449SC_2"] ))
names(fpkm_matrix_wwtr1_percent ) = c("FocusT/SC", "PLCT/SC", "S475T/SC", "S449T/SC", "S449T_2/SC_2", "FocusYT/SC", "PLCYT/SC", "S475YT/SC", "S449YT/SC", "S449YT_2/SC_2")

#barplot(fpkm_matrix_wwtr1, las=2)
barplot(fpkm_matrix_wwtr1_percent, las=2, cex.names=.7, ylab= "percent KD", main="wwtr1 based on fpkm values")

#yap1
fpkm_matrix_yap1 = fpkm_matrix[which(rownames(fpkm_matrix) == "ENSG00000137693"), ]
fpkm_matrix_yap1_percent = c(cbind(
    fpkm_matrix_yap1["FocusY_1"]/fpkm_matrix_yap1[ "FocusSC_1"]
  , fpkm_matrix_yap1["PLCY_1"]/fpkm_matrix_yap1[ "PLCSC_1"]
  , fpkm_matrix_yap1["S475Y_1"]/fpkm_matrix_yap1[ "S475SC_1"]
  , fpkm_matrix_yap1["S449Y_1"]/fpkm_matrix_yap1[ "S449SC_1"]
  , fpkm_matrix_yap1["S449Y_2"]/fpkm_matrix_yap1[ "S449SC_2"]
  , fpkm_matrix_yap1["FocusYT_1"]/fpkm_matrix_yap1[ "FocusSC_1"]
  , fpkm_matrix_yap1["PLCYT_1"]/fpkm_matrix_yap1[ "PLCSC_1"]
  , fpkm_matrix_yap1["S475YT_1"]/fpkm_matrix_yap1[ "S475SC_1"]
  , fpkm_matrix_yap1["S449YT_1"]/fpkm_matrix_yap1[ "S449SC_1"]
  , fpkm_matrix_yap1["S449YT_2"]/fpkm_matrix_yap1[ "S449SC_2"] ))
names(fpkm_matrix_yap1_percent ) = c("FocusY/SC", "PLCY/SC", "S475Y/SC", "S449Y/SC", "S449Y_2/SC_2",  "FocusYT/SC", "PLCYT/SC", "S475YT/SC", "S449YT/SC", "S449YT_2/SC_2")
#barplot(fpkm_matrix_yap1, las=2)
barplot(fpkm_matrix_yap1_percent, las=2,  cex.names=.7, ylab= "percent KD", main = "yap1 based on fpkm values")

S449Y_2 appears to have worked! Unfortunately, S447T_2 appears to have failed. It will be more difficult to justify removing samples that didn’t knock down its target.

Remove old S449 samples from matrix before fitting a model. Fit the model and estimate dispersion value.

counts4 = counts3[, c("FocusSC_1", "FocusT_1", "FocusYT_1", "FocusY_1", "PLCSC_1", "PLCT_1", "PLCYT_1", "PLCY_1", "S475SC_1", "S475T_1", "S475YT_1", "S475Y_1", "S449SC_2", "S449T_2", "S449Y_2", "S449YT_2" )]

Setup up design matrix for Heatmap

Condition_DE_ = unlist( lapply (matrix(unlist(strsplit(colnames(counts4), "_")), byrow=T, ncol=2)[,1], function(x) sub("S475", "", sub("S449", "", sub("PLC", "", sub("Focus", "", x))))) )
Condition_DE_  <- factor(Condition_DE_)
Subject_DE_ = unlist( lapply (matrix(unlist(strsplit(colnames(counts4), "_")), byrow=T, ncol=2)[,1], function(x) sub("SC", "", sub("YT", "", sub("T", "", sub("Y", "", x))))) )
Subject_DE_ <- factor(Subject_DE_)
design_matrix_DE <- model.matrix( ~Subject_DE_ + Condition_DE_)
colnames(design_matrix_DE)
## [1] "(Intercept)"     "Subject_DE_PLC"  "Subject_DE_S449" "Subject_DE_S475"
## [5] "Condition_DE_T"  "Condition_DE_Y"  "Condition_DE_YT"
y_for_DE <- DGEList(counts=counts4)
y_for_DE <- calcNormFactors(y_for_DE)
y_for_DE = estimateDisp(y_for_DE, design=design_matrix_DE) 
plotBCV(y_for_DE)

fit <- glmFit(y_for_DE, design=design_matrix_DE, dispersion=y_for_DE$trended.dispersion)

Run differential expression SC vs YT

lrt_T <- glmLRT(fit,coef = "Condition_DE_T")
topTags(lrt_T)
## Coefficient:  Condition_DE_T 
##                     logFC   logCPM       LR       PValue          FDR
## ENSG00000099974 -2.289454 5.864982 54.14714 1.860245e-13 3.982041e-09
## ENSG00000099977 -2.234506 5.895525 51.98310 5.598031e-13 5.991573e-09
## ENSG00000241313 -2.271751 3.424852 36.54100 1.494884e-09 1.066650e-05
## ENSG00000212724 -2.110529 5.695629 31.15033 2.387994e-08 1.277935e-04
## ENSG00000159055 -1.797491 4.440277 29.81984 4.741147e-08 2.029780e-04
## ENSG00000108821  1.611116 6.720624 27.96323 1.236429e-07 4.411166e-04
## ENSG00000138463 -1.596078 4.540343 24.86459 6.150176e-07 1.880724e-03
## ENSG00000181773 -2.166823 1.490235 21.50004 3.538213e-06 9.467374e-03
## ENSG00000189068 -2.761361 1.025508 21.25495 4.020700e-06 9.563012e-03
## ENSG00000150867  1.337408 6.370193 20.95854 4.693304e-06 1.004649e-02
smoothScatter(x=lrt_T$table$logFC, y=-log10(lrt_T$table$PValue))

lrt_T_table = data.frame(lrt_T$table, p.adjust(lrt_T$table$PValue, method = "BH"))
colnames(lrt_T_table) = c(colnames(lrt_T$table), "adjPVal_BH")
hist(lrt_T_table$PValue)

SC vs Y

lrt_Y <- glmLRT(fit,coef = "Condition_DE_Y")
topTags(lrt_Y)
## Coefficient:  Condition_DE_Y 
##                     logFC   logCPM        LR       PValue          FDR
## ENSG00000137693 -3.427520 6.901856 113.67506 1.535119e-26 3.286075e-22
## ENSG00000254422 -3.857873 4.607185 112.14306 3.324356e-26 3.558058e-22
## ENSG00000128564  3.274088 5.202611  86.56695 1.350859e-20 9.638832e-17
## ENSG00000267881 -6.965713 4.252158  69.53243 7.516840e-17 4.022637e-13
## ENSG00000275266  5.996143 4.363370  66.99688 2.719377e-16 1.164220e-12
## ENSG00000280390 -6.141730 4.886014  65.63278 5.432750e-16 1.938224e-12
## ENSG00000086548 -5.532175 5.913691  64.79711 8.302091e-16 2.538780e-12
## ENSG00000284010  5.897919 3.663154  54.15954 1.848543e-13 4.792689e-10
## ENSG00000162704 -2.194228 7.198675  53.99007 2.015052e-13 4.792689e-10
## ENSG00000157617 -2.071285 6.297929  47.53385 5.406245e-12 1.157261e-08
smoothScatter(x=lrt_Y$table$logFC, y=-log10(lrt_Y$table$PValue))

lrt_Y_table = data.frame(lrt_Y$table, p.adjust(lrt_Y$table$PValue, method = "BH"))
colnames(lrt_Y_table) = c(colnames(lrt_Y$table), "adjPVal_BH")
hist(lrt_Y_table$PValue)

SC vs YT

lrt_YT <- glmLRT(fit,coef = "Condition_DE_YT")
topTags(lrt_YT)
## Coefficient:  Condition_DE_YT 
##                     logFC   logCPM        LR       PValue          FDR
## ENSG00000137693 -3.424404 6.901856 112.79200 2.396404e-26 5.129742e-22
## ENSG00000212724 -4.123537 5.695629 104.47966 1.588158e-24 1.699805e-20
## ENSG00000254422 -3.503055 4.607185  93.56799 3.924837e-22 2.800502e-18
## ENSG00000267881 -8.711860 4.252158  84.13972 4.610157e-20 2.467125e-16
## ENSG00000128564  3.119758 5.202611  80.29383 3.226752e-19 1.381437e-15
## ENSG00000086548 -6.281735 5.913691  76.37045 2.351439e-18 8.389151e-15
## ENSG00000213417 -3.827566 4.556583  73.78900 8.692954e-18 2.658305e-14
## ENSG00000214518 -3.842997 4.370185  70.96220 3.641359e-17 9.743366e-14
## ENSG00000212725 -3.865983 4.073299  67.10244 2.577594e-16 6.130663e-13
## ENSG00000181444  2.908314 4.667671  64.75289 8.490495e-16 1.817475e-12
smoothScatter(x=lrt_YT$table$logFC, y=-log10(lrt_YT$table$PValue))

lrt_YT_table = data.frame(lrt_YT$table, p.adjust(lrt_YT$table$PValue, method = "BH"))
colnames(lrt_YT_table) = c(colnames(lrt_YT$table), "adjPVal_BH")
hist(lrt_YT_table$PValue)

Format output into excel sheet.

genes = rownames(counts4)
ensembl = useEnsembl(biomart="ensembl", dataset="hsapiens_gene_ensembl")
head(listAttributes(ensembl), 30)
##                               name
## 1                  ensembl_gene_id
## 2          ensembl_gene_id_version
## 3            ensembl_transcript_id
## 4    ensembl_transcript_id_version
## 5               ensembl_peptide_id
## 6       ensembl_peptide_id_version
## 7                  ensembl_exon_id
## 8                      description
## 9                  chromosome_name
## 10                  start_position
## 11                    end_position
## 12                          strand
## 13                            band
## 14                transcript_start
## 15                  transcript_end
## 16        transcription_start_site
## 17               transcript_length
## 18                  transcript_tsl
## 19        transcript_gencode_basic
## 20               transcript_appris
## 21              external_gene_name
## 22            external_gene_source
## 23        external_transcript_name
## 24 external_transcript_source_name
## 25                transcript_count
## 26      percentage_gene_gc_content
## 27                    gene_biotype
## 28              transcript_biotype
## 29                          source
## 30               transcript_source
##                                   description         page
## 1                              Gene stable ID feature_page
## 2                      Gene stable ID version feature_page
## 3                        Transcript stable ID feature_page
## 4                Transcript stable ID version feature_page
## 5                           Protein stable ID feature_page
## 6                   Protein stable ID version feature_page
## 7                              Exon stable ID feature_page
## 8                            Gene description feature_page
## 9                    Chromosome/scaffold name feature_page
## 10                            Gene start (bp) feature_page
## 11                              Gene end (bp) feature_page
## 12                                     Strand feature_page
## 13                             Karyotype band feature_page
## 14                      Transcript start (bp) feature_page
## 15                        Transcript end (bp) feature_page
## 16             Transcription start site (TSS) feature_page
## 17 Transcript length (including UTRs and CDS) feature_page
## 18             Transcript support level (TSL) feature_page
## 19                   GENCODE basic annotation feature_page
## 20                          APPRIS annotation feature_page
## 21                                  Gene name feature_page
## 22                        Source of gene name feature_page
## 23                            Transcript name feature_page
## 24                  Source of transcript name feature_page
## 25                           Transcript count feature_page
## 26                          Gene % GC content feature_page
## 27                                  Gene type feature_page
## 28                            Transcript type feature_page
## 29                              Source (gene) feature_page
## 30                        Source (transcript) feature_page
hgnc_swissprot <- getBM(attributes=c('ensembl_gene_id','hgnc_symbol','description'),filters = 'ensembl_gene_id',  values = genes , mart = ensembl)

fpkm_log_matrix2 = rpkm (y_for_DE, gene.length=counts1_gene_annotation[keep,"Length"], log = FALSE)
index = match (genes, hgnc_swissprot$ensembl_gene_id)
results = data.frame(hgnc_swissprot[index,], counts4, fpkm_log_matrix2, lrt_T_table, lrt_Y_table, lrt_YT_table)
colnames(results) = c(
                    colnames(hgnc_swissprot[index,])
                  , unlist(lapply (colnames(counts4), function(x) paste(x, "counts", sep="_")))
                  , unlist(lapply (colnames(fpkm_log_matrix2), function(x) paste(x, "fpkm", sep="_")))
                  , unlist(lapply (colnames(lrt_T_table), function(x) paste(x, "_SC_vs_T", sep="_")))
                  , unlist(lapply (colnames(lrt_Y_table), function(x) paste(x, "_SC_vs_Y", sep="_")))
                  , unlist(lapply (colnames(lrt_YT_table), function(x) paste(x, "_SC_vs_YT", sep="_"))) )
curr_date = format(Sys.time(), "%a_%b_%d_%H_hrs_%M_min_%S_sec_%Y")
write.table(results, paste0("~/Downloads/Chen_", curr_date, ".xls"), row.names=F, col.names=T, sep="\t")

Heatmap of DE genes.

pheatmap(fpkm_log_matrix2[order(lrt_T_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusT_1", "PLCT_1", "S475T_1", "S449T_2")], cluster_rows=FALSE, cluster_cols=FALSE)

pheatmap(fpkm_log_matrix2[order(lrt_Y_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusY_1", "PLCY_1", "S475Y_1", "S449Y_2")], cluster_rows=FALSE, cluster_cols=FALSE)

pheatmap(fpkm_log_matrix2[order(lrt_YT_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],colnames(fpkm_log_matrix2)[grep("YT", colnames(fpkm_log_matrix2))] )], cluster_rows=FALSE, cluster_cols=FALSE)

#try this
mean_values = (fit$coefficients %*% t(design_matrix_DE))
colnames(mean_values) = colnames(counts4)
rownames(mean_values) = rownames(counts4)

pheatmap(mean_values[order(lrt_T_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusT_1", "PLCT_1", "S475T_1", "S449T_2")], cluster_rows=FALSE, cluster_cols=FALSE)

pheatmap(mean_values[order(lrt_Y_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],"FocusY_1", "PLCY_1", "S475Y_1", "S449Y_2")], cluster_rows=FALSE, cluster_cols=FALSE)

pheatmap(mean_values[order(lrt_YT_table$PValue, decreasing = F)[1:20],c(colnames(fpkm_log_matrix2)[grep("SC", colnames(fpkm_log_matrix2))],colnames(fpkm_log_matrix2)[grep("YT", colnames(fpkm_log_matrix2))] )], cluster_rows=FALSE, cluster_cols=TRUE)

#edit column description
write.table(colnames(results), "~/Downloads/column_description.txt", quote=F, col.names=F, row.names=F,sep="\t")